*************************************************
* Purpose -- To clean downloaded QCEW annual data
* for analysis
************************************************
clear all
* RUN 00_path_master.do FIRST TO GET FILEPATHS

************************************************
* Keeping data we need, appending and reshaping
************************************************

* keeping county aggregate employment data and county-wise data for restaurant sector
* for codes, see - https://www.bls.gov/cew/classifications/aggregation/agg-level-titles.htm
import delimited using "$raw/qcew/1990.annual.singlefile.csv", clear
keep if (agglvl_code == 70) | ///
(agglvl_code == 75 & industry_code == "722")
tostring disclosure_code, replace
assert disclosure_code == "."
replace disclosure_code = ""

forvalues year = 1991/2019 {
	display "Working with `year'"
	preserve
		import delimited using "$raw/qcew/`year'.annual.singlefile.csv", clear
		keep if (agglvl_code == 70) | ///
		(agglvl_code == 75 & industry_code == "722")
		
		if `year' <= 2000 {
			tostring disclosure_code, replace
			assert disclosure_code == "."
			replace disclosure_code = ""
		}
		
		tempfile year
		save `year'
	restore
	
	append using `year'
}

drop if (mi(annual_avg_emplvl) | mi(total_annual_wages)) & inrange(own_code,1,4) 
bys area_fips year agglvl_code: egen own_code_max = max(own_code)

foreach var in annual_avg_estabs annual_avg_emplvl total_annual_wages {
	replace `var' = . if own_code_max!=5 & agglvl_code == 75
}

sort area_fips year agglvl_code own_code, stable
collapse (sum) annual_avg_estabs annual_avg_emplvl total_annual_wages (last) disclosure_code, by(area_fips year agglvl_code) 


* adding county names
preserve
	import excel using "$raw/qcew/area-titles-xlsx.xlsx", firstrow clear
	tempfile names
	save `names'
restore

merge m:1 area_fips using `names', assert(2 3) keep(3) nogen

* destringing area identifiers

g countyfips = real(area_fips)
g statefips = int(real(substr(area_fips,1,2)))

* saving an intermediate version so we don't have to run the time-consuming code above again
save "$raw/qcew/county_annual_appended.dta", replace
*/
use "$raw/qcew/county_annual_appended.dta", clear


* reshaping data to area-year observations
keep statefips countyfips year annual_avg* total_annual_wages agglvl_code disclosure_code area_title
reshape wide annual_avg* total_annual_wages disclosure_code, i(countyfips year) j(agglvl_code)

* cleaning based on Dorn document on county fips changes - https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf
* also described in readme
drop if countyfips == 46113 & year == 2015
replace countyfips = 46113 if countyfips == 46102

drop if inlist(statefips,72,78,2,11,15)	//removes DC, Alaska, Hawaii
drop if strpos(area_title,"Unknown")!=0
fillin countyfips year
bys countyfips (area_title): replace area_title = area_title[_N] if mi(area_title)
bys countyfips (statefips): replace statefips = statefips[1] if mi(statefips)
xtset countyfips year
************************************************
* Merging with population data
************************************************
* manual cleaning here based on this webpage - https://seer.cancer.gov/seerstat/variables/countyattribs/ruralurban.html
* also described in detail in readme file
preserve
	use "$clean/other/county_pop_15_64.dta", clear
	replace countyfips = 8001 if countyfips == 8911
	replace countyfips = 8013 if countyfips == 8912
	replace countyfips = 8059 if countyfips == 8913
	replace countyfips = 8123 if countyfips == 8914
	
	expand 2 if countyfips == 4910 & inrange(year,1990,1993), gen(expanded)
	replace countyfips = 4012 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 0
	replace countyfips = 4027 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 1
	drop expanded
	
	* disp 77158/(77158 + 9045) 
	* we get 0.895
	
	replace pop = 0.895*pop if countyfips == 4027 & inrange(year,1990,1993)
	replace pop = 0.105*pop if countyfips == 4012 & inrange(year,1990,1993)
	replace pop = int(pop) if inlist(countyfips,4012,4027) & inrange(year,1990,1993)
	
	tempfile pop
	save `pop'
restore

merge 1:1 countyfips year using `pop', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

************************************************
* Merging with minimum wage data
************************************************
merge m:1 statefips year using "$raw/other/mw_state_annual_vz.dta", keep(1 3)

bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

rename max_mw mw
drop *_mw

gen fed_mw = .

replace fed_mw = 3.10 if year == 1980
replace fed_mw = 3.35 if year >= 1981 & year <=1989
replace fed_mw = 3.80 if year == 1990
replace fed_mw = 4.25 if year >=1991 & year<=1995
replace fed_mw = 4.75 if year == 1996
replace fed_mw = 5.15 if year>=1997 & year<=2006
replace fed_mw = 5.85 if year == 2007
replace fed_mw = 6.55 if year == 2008
replace fed_mw = 7.25 if year>=2009

// source - https://www.dol.gov/agencies/whd/minimum-wage/history/chart


************************************************
* Merging with price index
************************************************

preserve

	import excel using "$raw/other/r-cpi-u-rs-allitems.xlsx", cellrange(A6) firstrow clear
	rename YEAR - AVG, lower
	keep year avg
	keep if inrange(year,1980,2019)
	
	* reindex to have 2023 as 100
	gen index = (avg/449.3)*100 //449.3 is index for 2023
	replace index = index/100
	
	drop avg
	tempfile cpi_rs
	save `cpi_rs'
restore

merge m:1 year using `cpi_rs', assert(2 3) keep(3) nogen

************************************************
* Merging with CZ data
************************************************

preserve
	use "$raw/jnr-cbp/cw_cty_czone_mod_AADHP.dta", clear
	rename cty_fips countyfips
	tempfile cz
	save `cz'
restore

merge m:1 countyfips using `cz', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

************************************************
* Balanced panel
************************************************

foreach i in 70 75 {
	gen missing`i' = mi(annual_avg_emplvl`i')
	gen suppressed`i' = disclosure_code`i' == "N"
	foreach var in annual_avg_estabs`i' annual_avg_emplvl`i' total_annual_wages`i' {
		replace `var' = . if suppressed`i' == 1
	}
	gen mis_sup`i' = missing`i' == 1 | suppressed`i' == 1
	bys countyfips (year): egen prop_mis_sup`i' = mean(mis_sup`i')
	gen balanced_full_`i' = prop_mis_sup`i' == 0
	gen balanced_90_`i' = prop_mis_sup`i' <= .1
}

drop _fillin
save "$clean/qcew/qcew_annual_clean_county.dta", replace

